# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("scran")
suppressPackageStartupMessages({
library(Seurat)
library(ggplot2) # plotting
library(patchwork) # combining figures
library(dplyr)
library(xlsx)
})data.filt = readRDS("results/data.filt.RDS")
data.filt## An object of class Seurat
## 33515 features across 1022 samples within 1 assay
## Active assay: RNA (33515 features, 2000 variable features)
## 3 layers present: counts, data, scale.data
## 2 dimensional reductions calculated: pca, umap
# selection.method = "vst": i.e. seurat_v3.
suppressWarnings(suppressMessages(data.filt <- FindVariableFeatures(data.filt, selection.method = "vst",
nfeatures = 2000, verbose = FALSE, assay = "RNA")))
top20 <- head(VariableFeatures(data.filt), 20)
LabelPoints(plot = VariableFeaturePlot(data.filt), points = top20, repel = TRUE)Scale the data, here, we can regress out some unwanted variation, but first we need to identify those unwanted variation
data.filt <- ScaleData(data.filt, assay = "RNA")
data.filt <- RunPCA(data.filt, npcs = 50, verbose = F)
## $Plot contribution of metadata to each PCs
print("Plot contribution of metadata to each PCs...")
dummy = SingleCellExperiment::SingleCellExperiment(
list(pc_space=t(data.filt@reductions$pca@cell.embeddings[, 1:10])),
colData=data.filt@meta.data[, c("nCount_RNA", "nFeature_RNA", "percent_mito",
"percent_ribo", "S.Score", "G2M.Score", "Phase")])
explan_pcs = scater::getVarianceExplained(dummy, exprs_values="pc_space")
scater::plotExplanatoryPCs(explan_pcs/100)## [1] "Plot contribution of metadata to each PCs..."
From the plot above, we can see that, nFeature_RNA, nCount_RNA, percent_ribo, Phase, S.Score, percent.mit, G2M.Score explain more than 1% of the varaince, we need to regress them out duing scaling
Scaling and regress out unwanted variation at the same time
data.filt <- ScaleData(data.filt, assay = "RNA", vars.to.regress = c("nFeature_RNA", "S.Score", "G2M.Score", "percent.mit"))data.filt <- RunPCA(data.filt, npcs = 50, verbose = F)
# run ?RunPCA to check for more parameters
# By default, RunPCA will use the highly variable features for building PCAwrap_plots(
DimPlot(data.filt, reduction = "pca", group.by = "orig.ident", dims = 1:2),
DimPlot(data.filt, reduction = "pca", group.by = "orig.ident", dims = 3:4),
DimPlot(data.filt, reduction = "pca", group.by = "orig.ident", dims = 5:6),
ncol = 3
) + plot_layout(guides = "collect")# Visualize the gene loadings on each PCs.
VizDimLoadings(data.filt, dims = 1:5, reduction = "pca", ncol = 5, balanced = T)Mainly the top PCs contain useful information, the rest of the PCs (principal components) may mostly contain random noises. In Seurat, the tSNE and UMAP analyses below are based on the PCA result, we will only select the top significant PCs and feed them to tSNE and UMAP.
How do we select the top PCs?
A common method for determining the number of PCs to be retained is a graphical representation known as an elbow plot. An elbow plot is a simple line segment plot that shows the eigenvalues (i.e. variability explained) for each individual PC. It shows the eigenvalues on the y-axis and the number of PCs on the x-axis. It always displays a downward curve. Most elbow plots look broadly similar in shape, starting high on the left, falling rather quickly, and then flattening out at some point. This is because the first component usually explains much of the variability, the next few components explain a moderate amount, and the latter components only explain a small fraction of the overall variability. The elbow plot criterion looks for the “elbow” in the curve and selects all components just before the line flattens out.
ElbowPlot(data.filt, reduction = "pca", ndims = 50)Here, 10 looks a good number to keep, but we usually keep a bit more than that to make sure we get most of the useful information, Let’s keep 15 instead.
data.filt <- RunTSNE(
data.filt,
reduction = "pca", dims = 1:15,
perplexity = 30, # low perplexity: finer structure
max_iter = 1000,
theta = 0.5,
eta = 200,
num_threads = 0
)
# run ?RunTSNE to see how to adjust parameters.DimPlot(data.filt, reduction = "tsne", group.by = "orig.ident",
pt.size=0.1)
# run ?DimPlot to see how to adjust parameters. What does each dot represent in the sSNE plot?
data.filt <- RunUMAP(
data.filt,
reduction = "pca",
dims = 1:15,
n.components = 2,
n.neighbors = 30,
n.epochs = 200,
min.dist = 0.3,
learning.rate = 1,
spread = 1
)
# Run ?RunUMAP to see how to adjust the parameters
# Larger values will result in more global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
# min_dist: The minimum distance between two points in the UMAP embedding.
# spread: A scaling factor for distance between embedded points.DimPlot(data.filt, reduction = "umap", group.by = "orig.ident") + ggplot2::ggtitle(label = "UMAP_on_PCA")How different it is between a tSNE plot and a UMAP plot?
Let’s plot all three dimensional reduciton plots together
wrap_plots(
DimPlot(data.filt, reduction = "pca", group.by = "orig.ident"),
DimPlot(data.filt, reduction = "tsne", group.by = "orig.ident"),
DimPlot(data.filt, reduction = "umap", group.by = "orig.ident"),
ncol = 3
) + plot_layout(guides = "collect")Apart from that, UMAP can also be built on the selected highly variable genes and on Graph.
| Markers | Cell Type |
|---|---|
| CD3E | T cells |
| CD3E CD4 | CD4+ T cells |
| CD3E CD8A | CD8+ T cells |
| GNLY, NKG7 | NK cells |
| MS4A1 | B cells |
| CD14, LYZ, CST3, MS4A7 | CD14+ Monocytes |
| FCGR3A, LYZ, CST3, MS4A7 | FCGR3A+ Monocytes |
| FCER1A, CST3 | DCs |
Plot the marker genes on the UMAP plot:
myfeatures <- c("CD3E", "CD4", "CD8A", "NKG7", "GNLY",
"MS4A1", "CD14", "LYZ", "MS4A7", "FCGR3A", "CST3", "FCER1A")
FeaturePlot(data.filt, reduction = "umap", dims = 1:2,
features = myfeatures, ncol = 4, order = T) +
NoLegend() + NoAxes() + NoGrid()Based on the marker gene distribution, can you guess approximately (using manual annotation) what cell types the cells belong to.
For demonstration, here, we will only run one
cell&reference-based automatic annotation, called label transfer
(From Seurat). But, instead of CCA, which is the default for the
FindTransferAnchors() function, we will use
pcaproject, ie; the query dataset is projected onto the
RPCA of the reference dataset. Then, the labels of the reference data
will be transfered to sufficiently similar ones in the query
dataset.
CCA-based integration therefore enables integrative analysis when experimental conditions or disease states introduce very strong expression shifts, or when integrating datasets across modalities and species. However, CCA-based integration may also lead to overcorrection, especially when a large proportion of cells are non-overlapping across datasets.
RPCA-based integration runs significantly faster, and also represents a more conservative approach where cells in different biological states are less likely to ‘align’ after integration. We therefore recommend RPCA during integrative analysis where:
A substantial fraction of cells in one dataset have no matching type in the other
Datasets originate from the same platform (i.e. multiple lanes of 10x genomics)
There are a large number of datasets or cells to integrate (see here for more tips on integrating large datasets)
First, download the reference data
# devtools::install_github("immunogenomics/harmony")
# devtools::install_github("powellgenomicslab/scPred")
reference <- scPred::pbmc_1
reference## An object of class Seurat
## 32838 features across 3500 samples within 1 assay
## Active assay: RNA (32838 features, 0 variable features)
## 2 layers present: counts, data
Here, we will run all the steps that we did in previous labs in one
go with the pipe-operator %>%.
reference <- reference %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA(verbose = F) %>%
RunUMAP(dims = 1:15)transfer.anchors <- FindTransferAnchors(
reference = reference, query = data.filt,
dims = 1:15
)
predictions <- TransferData(
anchorset = transfer.anchors, refdata = reference$cell_type,
dims = 1:15
)
data.filt <- AddMetaData(object = data.filt, metadata = predictions)wrap_plots(
DimPlot(reference, reduction = "umap", group.by = "cell_type"),
DimPlot(data.filt, reduction = "umap", group.by = "predicted.id"),
ncol = 2
) + plot_layout(guides = "collect")For the two figures above, the one to the left is the reference dataset, while the one to the right is the predicted lables in the query dataset. Do you think they more or less agree with your preliminary manual annotation?
For other automatic annotation methods, please see my lectures.
# First we need to build SNN Graph
data.filt <- FindNeighbors(data.filt,
reduction = "pca",
assay = "RNA",
k.param = 20,
features = VariableFeatures(data.filt)
)
# SNN was built based on pca as well as the highly variable features.
# run ?FindNeighbors to see how to adjust the parameters
# Clustering with louvain (algorithm 1) and a few different resolutions
for (res in c(0.1, 0.25, .5, 1, 1.5, 2, 2.5, 3, 3.5)) {
data.filt <- FindClusters(data.filt, graph.name = "RNA_snn", resolution = res, algorithm = 1)
}
# ?FindClusters
# each time you run clustering, the data is stored in meta data columns:
# seurat_clusters - lastest results only
# snn_res.XX - for each different resolution you test.## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9511
## Number of communities: 6
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9102
## Number of communities: 7
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8604
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7970
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7453
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6996
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6594
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6219
## Number of communities: 16
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1022
## Number of edges: 29820
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.5879
## Number of communities: 17
## Elapsed time: 0 seconds
Plot the clustering results
wrap_plots(
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.0.1") + ggtitle("louvain_0.1"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.0.25") + ggtitle("louvain_0.25"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.0.5") + ggtitle("louvain_0.5"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.1") + ggtitle("louvain_1"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.1.5") + ggtitle("louvain_1.5"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.2") + ggtitle("louvain_2"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.2.5") + ggtitle("louvain_2.5"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.3") + ggtitle("louvain_3"),
DimPlot(data.filt, reduction = "umap", group.by = "RNA_snn_res.3.5") + ggtitle("louvain_3.5"),
ncol = 3
)Which clustering resolution give a result that mostly agrees with out annotation? It’s hard to see! It looks that resolution 0.1 holds the highest similarity. Let’s check the quality of the cells first:
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb", "percent_plat")
FeaturePlot(data.filt, features=feats, ncol=3)Nothing obviously wrong!
What should we do? which one to choose! I recommend that at this stage, talk to your PI, and ask which version make the most biological sense.
During this process, try to get the differential genes for two or three sets of clusters you feel hard to choose from, and compare the differential genes and see with the help of more marker genes from the differential tables, whether you can decide on one that makes the most sense.
For demonstration, here, we will identify the differential genes of the label transfer-predicted cell types
From the differential gene results, we can check whether the marker genes for a particular celltype will appear in the differentially expressed gene list for the corresponding cluster.
One can also use differential analysis for exploring new marker genes.
Wilcoxon rank-sum is a non-parametrical differential analysis method. You might also consider much more powerful differential testing packages like MAST, LR, limma, DESeq2.
we will run FindAllMarkers to test one cluster vs the
rest, the largest celltype (T cell) will dominate the “rest” and
influence the results the most. So it is often a good idea to subsample
the clusters to an equal number of cells before running differential
expression for one vs the rest. So lets select 30 cells per cell type
here:
# Check cell number per celltype:
table(data.filt$predicted.id)
data.filt$predicted.id = factor(data.filt$predicted.id, levels=names(table(data.filt$predicted.id)))
# Identify differential genes for each celltype
Idents(data.filt) = data.filt$predicted.id
markers_genes <- FindAllMarkers(
data.filt,
log2FC.threshold = 0.2,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = 0.2,
only.pos = TRUE,
max.cells.per.ident = 30, # downsample each cell type to 30
assay = "RNA",
min.cells.group = 1
)
# Run ?FindAllMarkers to adjust for parameters!
# Filter the marker genes
markers_genes = markers_genes %>% filter(p_val_adj<0.1)
# Check the acquired number of marker genes per cell type:
table(markers_genes$cluster)##
## B cell CD4 T cell CD8 T cell cDC cMono ncMono
## 202 76 523 4 93 30
## NK cell pDC Plasma cell
## 92 1 1
##
## B cell CD4 T cell CD8 T cell cDC cMono ncMono
## 28 6 27 142 107 221
## NK cell pDC Plasma cell
## 12 289 168
Plot the top marker genes:
markers_genes %>%
group_by(cluster) %>%
slice_min(p_val_adj, n = 5, with_ties = FALSE) -> top5_sub
DotPlot(data.filt, features = rev(as.character(unique(top5_sub$gene))),
group.by = "predicted.id", assay = "RNA") + #coord_flip() +
theme(axis.text.x =
element_text(angle = 30, vjust = 1, hjust=1))Save the differential gene table into an excel file and now you can make some UMAP plots and take them to your PI and discuss.
write.xlsx(as.data.frame(markers_genes),
"results/DEGs_predicted.id.xlsx",
col.names=TRUE, row.names=FALSE)saveRDS(data.filt, "results/data.analyzed.RDS")sessionInfo()## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/ekol-yal/miniconda3/envs/seurat5/lib/libopenblasp-r0.3.21.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] xlsx_0.6.5 dplyr_1.1.0 patchwork_1.1.2 ggplot2_3.4.1
## [5] Seurat_5.0.1 SeuratObject_5.0.1 sp_1.6-0 captioner_2.2.3
## [9] bookdown_0.33 knitr_1.42
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 spatstat.explore_3.1-0
## [3] reticulate_1.28 tidyselect_1.2.0
## [5] htmlwidgets_1.6.1 grid_4.2.2
## [7] BiocParallel_1.32.5 Rtsne_0.16
## [9] pROC_1.18.0 munsell_0.5.0
## [11] ScaledMatrix_1.6.0 codetools_0.2-19
## [13] ica_1.0-3 future_1.31.0
## [15] miniUI_0.1.1.1 withr_2.5.0
## [17] spatstat.random_3.1-4 colorspace_2.1-0
## [19] progressr_0.13.0 Biobase_2.58.0
## [21] highr_0.10 rstudioapi_0.14
## [23] stats4_4.2.2 SingleCellExperiment_1.20.1
## [25] ROCR_1.0-11 tensor_1.5
## [27] rJava_1.0-6 listenv_0.9.0
## [29] MatrixGenerics_1.10.0 labeling_0.4.2
## [31] harmony_1.2.0 GenomeInfoDbData_1.2.9
## [33] polyclip_1.10-4 farver_2.1.1
## [35] parallelly_1.34.0 vctrs_0.5.2
## [37] generics_0.1.3 ipred_0.9-13
## [39] xfun_0.37 timechange_0.2.0
## [41] R6_2.5.1 GenomeInfoDb_1.34.9
## [43] ggbeeswarm_0.7.2 rsvd_1.0.5
## [45] bitops_1.0-7 spatstat.utils_3.0-2
## [47] cachem_1.0.7 DelayedArray_0.24.0
## [49] promises_1.2.0.1 scales_1.2.1
## [51] nnet_7.3-18 beeswarm_0.4.0
## [53] gtable_0.3.1 beachmat_2.14.2
## [55] globals_0.16.2 goftest_1.2-3
## [57] spam_2.10-0 timeDate_4022.108
## [59] rlang_1.0.6 splines_4.2.2
## [61] lazyeval_0.2.2 ModelMetrics_1.2.2.2
## [63] spatstat.geom_3.1-0 yaml_2.3.7
## [65] reshape2_1.4.4 abind_1.4-5
## [67] httpuv_1.6.9 caret_6.0-93
## [69] lava_1.7.2.1 tools_4.2.2
## [71] ellipsis_0.3.2 jquerylib_0.1.4
## [73] RColorBrewer_1.1-3 BiocGenerics_0.44.0
## [75] ggridges_0.5.4 Rcpp_1.0.10
## [77] plyr_1.8.8 sparseMatrixStats_1.10.0
## [79] zlibbioc_1.44.0 purrr_1.0.1
## [81] RCurl_1.98-1.10 rpart_4.1.19
## [83] deldir_1.0-6 pbapply_1.7-0
## [85] viridis_0.6.2 cowplot_1.1.1
## [87] S4Vectors_0.36.2 zoo_1.8-11
## [89] SummarizedExperiment_1.28.0 ggrepel_0.9.3
## [91] cluster_2.1.4 magrittr_2.0.3
## [93] data.table_1.14.8 RSpectra_0.16-1
## [95] scattermore_1.2 lmtest_0.9-40
## [97] RANN_2.6.1 fitdistrplus_1.1-11
## [99] matrixStats_0.63.0 xlsxjars_0.6.1
## [101] mime_0.12 evaluate_0.20
## [103] xtable_1.8-4 fastDummies_1.7.3
## [105] IRanges_2.32.0 gridExtra_2.3
## [107] compiler_4.2.2 scater_1.26.1
## [109] tibble_3.1.8 KernSmooth_2.23-20
## [111] htmltools_0.5.4 later_1.3.0
## [113] tidyr_1.3.0 lubridate_1.9.2
## [115] MASS_7.3-58.2 Matrix_1.6-5
## [117] cli_3.6.0 gower_1.0.1
## [119] parallel_4.2.2 dotCall64_1.1-1
## [121] igraph_1.4.1 GenomicRanges_1.50.2
## [123] pkgconfig_2.0.3 recipes_1.0.5
## [125] plotly_4.10.1 scuttle_1.8.4
## [127] spatstat.sparse_3.0-1 foreach_1.5.2
## [129] hardhat_1.2.0 vipor_0.4.7
## [131] bslib_0.4.2 XVector_0.38.0
## [133] prodlim_2019.11.13 stringr_1.5.0
## [135] digest_0.6.31 sctransform_0.4.1
## [137] RcppAnnoy_0.0.20 spatstat.data_3.0-1
## [139] rmarkdown_2.20 leiden_0.4.3
## [141] uwot_0.1.14 DelayedMatrixStats_1.20.0
## [143] shiny_1.7.4 lifecycle_1.0.3
## [145] nlme_3.1-162 jsonlite_1.8.4
## [147] BiocNeighbors_1.16.0 limma_3.54.2
## [149] viridisLite_0.4.1 fansi_1.0.4
## [151] pillar_1.8.1 lattice_0.20-45
## [153] fastmap_1.1.1 httr_1.4.5
## [155] survival_3.5-3 glue_1.6.2
## [157] png_0.1-8 iterators_1.0.14
## [159] presto_1.0.0 class_7.3-21
## [161] stringi_1.7.12 sass_0.4.5
## [163] RcppHNSW_0.4.1 scPred_1.9.2
## [165] BiocSingular_1.14.0 irlba_2.3.5.1
## [167] future.apply_1.10.0